#Classic, Data Manipulation
import pandas as pd
import numpy as np
#Plots
import matplotlib.pyplot as plt
import seaborn as sns
#Data processing, metrics and modeling
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score, precision_recall_curve,precision_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
# to display Image files
from PIL import Image as PILImage
#ignore warning messages
import warnings
warnings.filterwarnings('ignore')
# Load the diabetes dataset and preview the first rows.
df = pd.read_csv('health care diabetes.csv')
df.head()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 148 | 72 | 35 | 0 | 33.6 | 0.627 | 50 | 1 |
| 1 | 1 | 85 | 66 | 29 | 0 | 26.6 | 0.351 | 31 | 0 |
| 2 | 8 | 183 | 64 | 0 | 0 | 23.3 | 0.672 | 32 | 1 |
| 3 | 1 | 89 | 66 | 23 | 94 | 28.1 | 0.167 | 21 | 0 |
| 4 | 0 | 137 | 40 | 35 | 168 | 43.1 | 2.288 | 33 | 1 |
# Dataset dimensions: 768 rows, 9 columns.
df.shape
(768, 9)
# Summary statistics; note the zero minimums in several clinical columns.
df.describe()
| Pregnancies | Glucose | BloodPressure | SkinThickness | Insulin | BMI | DiabetesPedigreeFunction | Age | Outcome | |
|---|---|---|---|---|---|---|---|---|---|
| count | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 | 768.000000 |
| mean | 3.845052 | 120.894531 | 69.105469 | 20.536458 | 79.799479 | 31.992578 | 0.471876 | 33.240885 | 0.348958 |
| std | 3.369578 | 31.972618 | 19.355807 | 15.952218 | 115.244002 | 7.884160 | 0.331329 | 11.760232 | 0.476951 |
| min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.078000 | 21.000000 | 0.000000 |
| 25% | 1.000000 | 99.000000 | 62.000000 | 0.000000 | 0.000000 | 27.300000 | 0.243750 | 24.000000 | 0.000000 |
| 50% | 3.000000 | 117.000000 | 72.000000 | 23.000000 | 30.500000 | 32.000000 | 0.372500 | 29.000000 | 0.000000 |
| 75% | 6.000000 | 140.250000 | 80.000000 | 32.000000 | 127.250000 | 36.600000 | 0.626250 | 41.000000 | 1.000000 |
| max | 17.000000 | 199.000000 | 122.000000 | 99.000000 | 846.000000 | 67.100000 | 2.420000 | 81.000000 | 1.000000 |
# Column names of the dataframe.
df.columns
Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
dtype='object')
# Count zero entries per clinical column (all columns except Pregnancies and
# Outcome); zeros in these measurements are physiologically impossible, so
# they are hidden missing values. Uses a vectorized boolean sum instead of
# materializing a filtered dataframe per column.
for col in df.columns[1:-1]:
    zero_count = int((df[col] == 0).sum())
    if zero_count >= 1:
        print(col, '---- has total {} Zero values'.format(zero_count))
    else:
        print(col, '---- has no zero values and is good to go')
Glucose ---- has total 5 Zero values BloodPressure ---- has total 35 Zero values SkinThickness ---- has total 227 Zero values Insulin ---- has total 374 Zero values BMI ---- has total 11 Zero values DiabetesPedigreeFunction ---- has no zero values and is good to go Age ---- has no zero values and is good to go
A person cannot have a value of zero for Glucose, BloodPressure, SkinThickness, Insulin or BMI (DiabetesPedigreeFunction and Age contain no zeros). These zero values don't make any sense — they are nothing but missing values in disguise, so we'll treat them with missing-value imputation techniques.
# Work on a deep copy of the raw data; replace impossible zeros in the
# clinical columns with NaN so they are treated as missing values.
# Uses np.nan — the np.NaN alias was removed in NumPy 2.0.
df_copy = df.copy(deep=True)
clinical_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df_copy[clinical_cols] = df_copy[clinical_cols].replace(0, np.nan)
# Missing-value count per column after the zero-to-NaN replacement.
df_copy.isnull().sum()
Pregnancies 0 Glucose 5 BloodPressure 35 SkinThickness 227 Insulin 374 BMI 11 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
Let's see the distribution of data points in order to fill the null values.
# Pairwise scatter/histogram grid of the raw data to inspect distributions.
sns.pairplot(df)
<seaborn.axisgrid.PairGrid at 0x2515176b1f0>
Filling Missing Values
# Re-check null counts before starting imputation.
df_copy.isnull().sum()
Pregnancies 0 Glucose 5 BloodPressure 35 SkinThickness 227 Insulin 374 BMI 11 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
def median_imp(var):
    """Return a small frame with the median of *var* for each Outcome class.

    Rows where *var* is missing are excluded before taking the medians, so
    the result reflects observed values only.
    """
    observed = df_copy.loc[df_copy[var].notnull(), [var, 'Outcome']]
    return observed.groupby(['Outcome'])[[var]].median().reset_index()
# Per-Outcome Glucose medians used for the imputation below.
median_imp('Glucose')
| Outcome | Glucose | |
|---|---|---|
| 0 | 0 | 107.0 |
| 1 | 1 | 140.0 |
# Fill missing Glucose with the per-Outcome medians computed above.
for outcome_val, med in ((0, 107), (1, 140)):
    missing = (df_copy['Outcome'] == outcome_val) & df_copy['Glucose'].isnull()
    df_copy.loc[missing, 'Glucose'] = med
# Per-Outcome BloodPressure medians used for the imputation below.
median_imp('BloodPressure')
| Outcome | BloodPressure | |
|---|---|---|
| 0 | 0 | 70.0 |
| 1 | 1 | 74.5 |
# Fill missing BloodPressure with the per-Outcome medians computed above.
for outcome_val, med in ((0, 70), (1, 74.5)):
    missing = (df_copy['Outcome'] == outcome_val) & df_copy['BloodPressure'].isnull()
    df_copy.loc[missing, 'BloodPressure'] = med
# Per-Outcome SkinThickness medians used for the imputation below.
median_imp('SkinThickness')
| Outcome | SkinThickness | |
|---|---|---|
| 0 | 0 | 27.0 |
| 1 | 1 | 32.0 |
# Fill missing SkinThickness with the per-Outcome medians computed above.
for outcome_val, med in ((0, 27), (1, 32)):
    missing = (df_copy['Outcome'] == outcome_val) & df_copy['SkinThickness'].isnull()
    df_copy.loc[missing, 'SkinThickness'] = med
# Per-Outcome Insulin medians used for the imputation below.
median_imp('Insulin')
| Outcome | Insulin | |
|---|---|---|
| 0 | 0 | 102.5 |
| 1 | 1 | 169.5 |
# Fill missing Insulin with the per-Outcome medians computed above.
for outcome_val, med in ((0, 102.5), (1, 169.5)):
    missing = (df_copy['Outcome'] == outcome_val) & df_copy['Insulin'].isnull()
    df_copy.loc[missing, 'Insulin'] = med
# Per-Outcome BMI medians used for the imputation below.
median_imp('BMI')
| Outcome | BMI | |
|---|---|---|
| 0 | 0 | 30.1 |
| 1 | 1 | 34.3 |
# Fill missing BMI with the per-Outcome medians computed above.
for outcome_val, med in ((0, 30.1), (1, 34.3)):
    missing = (df_copy['Outcome'] == outcome_val) & df_copy['BMI'].isnull()
    df_copy.loc[missing, 'BMI'] = med
# Verify that no nulls remain after imputation.
df_copy.isnull().sum()
Pregnancies 0 Glucose 0 BloodPressure 0 SkinThickness 0 Insulin 0 BMI 0 DiabetesPedigreeFunction 0 Age 0 Outcome 0 dtype: int64
Now our dataset is free from any null values so we can proceed further
Pair plot after handling missing values
# Pair plot of the cleaned (imputed) data.
sns.pairplot(df_copy)
<seaborn.axisgrid.PairGrid at 0x25154c6bc40>
Count of types of columns in dataset
# Tally how many columns hold each dtype in the raw dataframe.
# NOTE(review): this inspects the raw `df`; after imputation `df_copy` has
# more float columns — confirm which frame is intended here.
int_dtype = df.select_dtypes(include=['int64']).columns
float_dtype = df.select_dtypes(include=['float64']).columns
obj_dtype = df.select_dtypes(include=['object']).columns
print('No of integer columns in dataframe is :', len(int_dtype))
# Fixed typo in the printed message: "flaot" -> "float".
print('No of float columns in dataframe is :', len(float_dtype))
print('No of object columns in dataframe is :', len(obj_dtype))
No of integer columns in dataframe is : 7 No of flaot columns in dataframe is : 2 No of object columns in dataframe is : 0
# Bar chart of how many columns hold each dtype.
sns.countplot(x = df.dtypes.map(str),palette='Set2')
<AxesSubplot:ylabel='count'>
Count of diabetic and healthy people in the dataset
# Class balance: 500 healthy (0) vs 268 diabetic (1).
df.Outcome.value_counts()
0 500 1 268 Name: Outcome, dtype: int64
# Relabel the 0/1 outcome as readable categories and plot the class balance.
outcome_labels = df['Outcome'].astype('category').cat.rename_categories(['Healthy', 'Diabetic'])
sns.countplot(x=outcome_labels)
<AxesSubplot:xlabel='Outcome', ylabel='count'>
# Pair plot of the cleaned data colored by Outcome.
sns.pairplot(df_copy,hue = 'Outcome')
<seaborn.axisgrid.PairGrid at 0x2515ae12580>
Heatmap of Original Dataset
# Correlation heatmap of the raw dataframe (zeros still present).
corr_raw = df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_raw, annot=True, cmap='viridis')
<AxesSubplot:>
Heatmap of Clean Data
# Correlation heatmap of the cleaned (imputed) dataframe.
corr_clean = df_copy.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_clean, annot=True, cmap='viridis')
<AxesSubplot:>
From the above heatmap we see a bit of correlation between some columns i.e.
Age and Pregnancies = 0.54 Glucose and insulin = 0.49 SkinThickness and BMI = 0.57 Let's create some scatter plots for above mentioned column pairs to understand the relationship among the top correlation values:
def sctr_plot(var1, var2):
    """Scatter plot of *var2* against *var1* from df_copy, colored by Outcome."""
    sns.scatterplot(data=df_copy, x=var1, y=var2, hue='Outcome', marker='D')
# Scatter plots for the most correlated feature pairs noted above.
sctr_plot('Age','Pregnancies')
sctr_plot('Glucose','Insulin')
sctr_plot('SkinThickness','BMI')
Data Split for training and testing
# Feature/target split: every column except Outcome is a predictor.
X = df_copy.drop(columns='Outcome')
y = df_copy['Outcome']
X.shape, y.shape
((768, 8), (768,))
# Stratified 70/30 split keeps the class ratio identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((537, 8), (231, 8), (537,), (231,))
Standardization — to bring all features onto the same scale we'll perform standardization.
# Standardize features: fit the scaler on the training split only, then
# apply the same transformation to both splits to avoid leakage.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Baseline logistic regression on the scaled features.
Log_model = LogisticRegression(max_iter=10000)
Log_model.fit(X_train_scaled,y_train)
LogisticRegression(max_iter=10000)
# Test-set predictions and per-class precision/recall/F1 for logistic regression.
log_pred = Log_model.predict(X_test_scaled)
print(classification_report(y_test,log_pred))
precision recall f1-score support
0 0.79 0.84 0.81 150
1 0.66 0.58 0.62 81
accuracy 0.75 231
macro avg 0.72 0.71 0.72 231
weighted avg 0.74 0.75 0.74 231
# Confusion matrix and overall accuracy for logistic regression.
print(confusion_matrix(y_test,log_pred))
print('\n','Accuracy - ',accuracy_score(y_test,log_pred))
[[126 24] [ 34 47]] Accuracy - 0.7489177489177489
# Grid-search a random forest over ensemble size, feature subsampling,
# bootstrapping and out-of-bag scoring.
rfc = RandomForestClassifier()
param_grid = {
    'n_estimators': [75, 100, 125, 150, 200],
    'max_features': [4, 5, 6, 7, 8],
    'bootstrap': [True, False],
    'oob_score': [True, False],
}
grid = GridSearchCV(rfc, param_grid)
grid.fit(X_train_scaled, y_train)
GridSearchCV(estimator=RandomForestClassifier(),
param_grid={'bootstrap': [True, False],
'max_features': [4, 5, 6, 7, 8],
'n_estimators': [75, 100, 125, 150, 200],
'oob_score': [True, False]})
# Predict with the best random forest found by the grid search.
rfc_pred = grid.predict(X_test_scaled)
print(classification_report(y_test,rfc_pred))
precision recall f1-score support
0 0.88 0.91 0.90 150
1 0.83 0.78 0.80 81
accuracy 0.87 231
macro avg 0.86 0.85 0.85 231
weighted avg 0.86 0.87 0.86 231
# Confusion matrix and accuracy for the tuned random forest.
# Fixed: the original printed the logistic-regression confusion matrix
# (log_pred) here instead of the random-forest predictions.
print(confusion_matrix(y_test, rfc_pred))
print('\n', 'Accuracy - ', accuracy_score(y_test, rfc_pred))
[[126 24] [ 34 47]] Accuracy - 0.8658008658008658
# Hyperparameters of the best random forest.
grid.best_params_
{'bootstrap': True, 'max_features': 4, 'n_estimators': 75, 'oob_score': False}
# Grid-search an SVM over regularization strength and kernel choice.
svc = SVC()
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
}
grid = GridSearchCV(svc, param_grid)
grid.fit(X_train_scaled, y_train)
GridSearchCV(estimator=SVC(),
param_grid={'C': [0.01, 0.1, 1, 10],
'kernel': ['linear', 'poly', 'rbf', 'sigmoid']})
# Predictions and per-class report for the best SVM.
svc_pred = grid.predict(X_test_scaled)
print(classification_report(y_test,svc_pred))
precision recall f1-score support
0 0.85 0.87 0.86 150
1 0.75 0.70 0.73 81
accuracy 0.81 231
macro avg 0.80 0.79 0.79 231
weighted avg 0.81 0.81 0.81 231
# Confusion matrix and accuracy for the tuned SVM.
# Fixed: the original printed the logistic-regression confusion matrix
# (log_pred) here instead of the SVM predictions.
print(confusion_matrix(y_test, svc_pred))
print('\n', 'Accuracy - ', accuracy_score(y_test, svc_pred))
[[126 24] [ 34 47]] Accuracy - 0.8138528138528138
# Best SVM hyperparameters.
grid.best_params_
{'C': 1, 'kernel': 'rbf'}
# Elbow analysis: train/test accuracy of KNN for every k in 1..19,
# plotted together to spot over/underfitting.
train_score = []
test_score = []
for k in range(1, 20):
    model_k = KNeighborsClassifier(n_neighbors=k)
    model_k.fit(X_train_scaled, y_train)
    train_score.append(model_k.score(X_train_scaled, y_train))
    test_score.append(model_k.score(X_test_scaled, y_test))
sns.lineplot(x=range(1, 20), y=train_score, marker='o')
sns.lineplot(x=range(1, 20), y=test_score, marker='o')
<AxesSubplot:>
# Test-set accuracy for k = 1..9; print the best value and plot the curve.
acc_score = []
for k in range(1, 10):
    model_k = KNeighborsClassifier(n_neighbors=k)
    model_k.fit(X_train_scaled, y_train)
    acc_score.append(accuracy_score(y_test, model_k.predict(X_test_scaled)))
print(max(acc_score))
sns.lineplot(x=range(1, 10), y=acc_score, marker='o')
0.8398268398268398
<AxesSubplot:>
From the above results we can conclude that n = 7 gives the best test accuracy, so we'll use n_neighbors = 7 for the final model.
# Final KNN model with the k chosen from the elbow analysis above.
final_knn_model = KNeighborsClassifier(n_neighbors=7)
final_knn_model.fit(X_train_scaled,y_train)
KNeighborsClassifier(n_neighbors=7)
# Test-set accuracy of the final KNN model.
knn_pred = final_knn_model.predict(X_test_scaled)
print(accuracy_score(y_test,knn_pred))
0.8398268398268398
# Per-class metrics for the final KNN model.
print(classification_report(y_test,knn_pred))
precision recall f1-score support
0 0.87 0.89 0.88 150
1 0.78 0.75 0.77 81
accuracy 0.84 231
macro avg 0.83 0.82 0.82 231
weighted avg 0.84 0.84 0.84 231
# KNN confusion matrix on the test set.
print(confusion_matrix(y_test,knn_pred))
[[133 17] [ 20 61]]
# Tune a decision tree over split criterion, minimum split size and the
# number of features considered per split.
dt = DecisionTreeClassifier(random_state=42)
param_grid = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': [2, 3, 4, 5],
    'max_features': [4, 5, 6, 7, 8],
}
grid_dt = GridSearchCV(dt, param_grid)
grid_dt.fit(X_train_scaled, y_train)
GridSearchCV(estimator=DecisionTreeClassifier(random_state=42),
param_grid={'criterion': ['gini', 'entropy'],
'max_features': [4, 5, 6, 7, 8],
'min_samples_split': [2, 3, 4, 5]})
# Best decision-tree hyperparameters found by the grid search.
grid_dt.best_params_
{'criterion': 'entropy', 'max_features': 6, 'min_samples_split': 3}
# Decision-tree predictions and test accuracy.
dt_pred = grid_dt.predict(X_test_scaled)
print(accuracy_score(y_test,dt_pred))
0.8614718614718615
# Per-class metrics for the tuned decision tree.
print(classification_report(y_test,dt_pred))
precision recall f1-score support
0 0.87 0.92 0.90 150
1 0.84 0.75 0.79 81
accuracy 0.86 231
macro avg 0.85 0.84 0.84 231
weighted avg 0.86 0.86 0.86 231
# Confusion matrix for the tuned decision tree on the test set.
# Fixed: the original computed confusion_matrix twice; reuse dt_cm.
dt_cm = confusion_matrix(y_test, dt_pred)
print(dt_cm)
[[138 12] [ 20 61]]
# NOTE(review): duplicate of the grid_dt.best_params_ call a few cells above.
grid_dt.best_params_
{'criterion': 'entropy', 'max_features': 6, 'min_samples_split': 3}
In a Nutshell: Accuracy, Sensitivity and Specificity
# Final model zoo: each entry pairs a display label with an estimator
# configured from the tuning results above.
# NOTE(review): the RandomForest settings (max_features=6, n_estimators=100,
# oob_score=True) do not match the grid.best_params_ reported earlier
# ({'bootstrap': True, 'max_features': 4, 'n_estimators': 75,
# 'oob_score': False}) — confirm which configuration is intended.
models = [
    {
        'label': 'Logistic Regression',
        'model': LogisticRegression(),
    },
    {
        'label': 'KNeighbors Classifier',
        'model': KNeighborsClassifier(n_neighbors=7),
    },
    {
        'label': 'Support Vector Classifier',
        # probability=True so predict_proba is available for ROC curves.
        'model': SVC(C=1, kernel='rbf', probability=True),
    },
    {
        'label': 'Decision Trees',  # fixed typo: was 'Decision Tress'
        'model': DecisionTreeClassifier(random_state=42, criterion='entropy',
                                        max_features=6, min_samples_split=3),
    },
    {
        'label': 'Random Forest Classifier',
        'model': RandomForestClassifier(bootstrap=True, max_features=6,
                                        n_estimators=100, oob_score=True),
    },
]
# Fit every candidate model once and collect its test-set accuracy into a
# summary table. Removed: an unused confusion-matrix computation, unused
# sensitivity/specificity lists, and commented-out code with unbalanced
# parentheses (the sensitivity/specificity table is built in the next cell).
accu = []
model_name = []
for m in models:
    model1 = m['model']
    model1.fit(X_train_scaled, y_train)    # train the model
    pred = model1.predict(X_test_scaled)   # predict the test data
    accu.append(accuracy_score(y_test, pred))
    model_name.append(m['label'])
models_accuracy = pd.DataFrame(data=accu, index=model_name,
                               columns=['Accuracy Score'])
models_accuracy
| Accuracy Score | |
|---|---|
| Logistic Regression | 0.748918 |
| KNeighbors Classifier | 0.839827 |
| Support Vector Classifier | 0.813853 |
| Decision Tress | 0.861472 |
| Random Forest Classifier | 0.883117 |
# Refit each model and report accuracy, sensitivity (true-positive rate) and
# specificity (true-negative rate) on the test set.
# With Outcome == 1 (diabetic) as the positive class:
#   sensitivity = TP / (TP + FN) = cm[1,1] / (cm[1,0] + cm[1,1])
#   specificity = TN / (TN + FP) = cm[0,0] / (cm[0,0] + cm[0,1])
# Fixed: the original had the two formulas swapped, so the columns labelled
# Sensitivity/Specificity reported each other's values. Also pass
# columns=model_name (a flat list) to avoid an accidental MultiIndex.
accu = []
model_name = []
sensitivity = []
specificity = []
for m in models:
    model1 = m['model']
    model1.fit(X_train_scaled, y_train)    # train the model
    pred = model1.predict(X_test_scaled)   # predict the test data
    cm = confusion_matrix(y_test, pred)
    accu.append(accuracy_score(y_test, pred))
    model_name.append(m['label'])
    sensitivity.append(cm[1, 1] / (cm[1, 0] + cm[1, 1]))
    specificity.append(cm[0, 0] / (cm[0, 0] + cm[0, 1]))
models_accu_sen_sp = pd.DataFrame(
    data=(accu, sensitivity, specificity),
    index=['Accuracy', 'Sensitivity', 'Specificity'],
    columns=model_name).T
models_accu_sen_sp
| Accuracy | Sensitivity | Specificity | |
|---|---|---|---|
| Logistic Regression | 0.748918 | 0.840000 | 0.580247 |
| KNeighbors Classifier | 0.839827 | 0.886667 | 0.753086 |
| Support Vector Classifier | 0.813853 | 0.873333 | 0.703704 |
| Decision Tress | 0.861472 | 0.920000 | 0.753086 |
| Random Forest Classifier | 0.874459 | 0.920000 | 0.790123 |
Combined ROC Curve for all the models
# Plot every model's ROC curve on one figure.
# Fixed: the AUC in the legend is now computed from the same probability
# scores as the curve itself. The original passed hard 0/1 predictions to
# roc_auc_score, which understates the area and disagrees with the plotted
# curve. Also removed the unused y_pred variable.
for m in models:
    model = m['model']
    model.fit(X_train_scaled, y_train)  # train the model
    # Probability of the positive (diabetic) class for every test sample.
    scores = model.predict_proba(X_test_scaled)[:, 1]
    # Compute false-positive and true-positive rates across thresholds.
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    # Area under the curve, shown in the legend.
    auc = roc_auc_score(y_test, scores)
    plt.plot(fpr, tpr, label='%s ROC (area = %0.2f)' % (m['label'], auc))
# Chance diagonal and plot cosmetics.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('1-Specificity(False Positive Rate)')
plt.ylabel('Sensitivity(True Positive Rate)')
plt.title('Receiver Operating Characteristic')
plt.legend(loc=(1.01, 0))
plt.show()  # Display